# Get crime data from ArcGIS API and remove (4) entries with missing geometry
# Write cleaned data to GeoJSON file
# crime_dg <- st_read("https://utility.arcgis.com/usrsvcs/servers/3adad6320b7a421bb3826ec8871e2b66/rest/services/OpenData_PublicSafety/MapServer/2/query?outFields=*&where=1%3D1&f=geojson")
# crime_dg$Date <- as.Date(fread(".//data//crime_dg.csv")$Date, "%Y/%m/%d")
# crime_dg <- crime_dg[!st_is_empty(crime_dg),]
# st_write(crime_dg, ".//data//crime_dg.geojson")
# Read in Hartford crime and parcel data
# c <- st_read(".//data//crime_dg.geojson")
# p <- st_read(".//data//parcel_hartford.geojson")
# Filter parcels for single family homes only
# Entire apartment or residential complexes are bought less frequently
# and are more complicated to appraise.
# resid_labels <- c("ONE FAMILY") #, "TWO FAMILY", "THREE FAMILY")
# pr <- p[p$State_Use_Description %in% resid_labels, ]
# Filter crimes from 2016-2021
# Property appraisal was made in 2021.
# Assume that about 5 years of crime data is sufficient to capture the effect
# of crime on property values if any effect exists.
# cc <- c[as.Date(c$Date) >= as.Date("2016-01-01") &
# as.Date(c$Date) <= as.Date("2021-12-31"),]
# Save filtered data
# st_write(pr, ".//data//parcel_hartford_single_family.geojson")
# st_write(cc, ".//data//crime_hartford_2016_2021.geojson")
# Read in filtered Hartford crime, parcel, and population data
z <- st_read(".//data//crime_hartford_2016_2021.geojson")
## Reading layer `crime_hartford_2016_2021' from data source
## `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\crime_hartford_2016_2021.geojson'
## using driver `GeoJSON'
## Simple feature collection with 197060 features and 12 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -72.71865 ymin: 41.72403 xmax: -72.65041 ymax: 41.80719
## Geodetic CRS: WGS 84
p <- st_read(".//data//parcel_hartford_single_family.geojson")
## Reading layer `parcel_hartford_single_family' from data source
## `C:\Users\llint\OneDrive - Yale University\classes\625\CT Property\SDS625\data\parcel_hartford_single_family.geojson'
## using driver `GeoJSON'
## Simple feature collection with 7239 features and 47 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -72.71637 ymin: 41.72374 xmax: -72.65809 ymax: 41.80743
## Geodetic CRS: WGS 84
pop <- read.csv(".//data//population_by_tract.csv", skip = 3, header = T)[,1:3]
# Get polygons for Hartford's census tracts
hartford_tracts <- st_filter(tracts(state = "CT"),
subset(county_subdivisions(state = "CT"),
NAMELSAD == "Hartford town"),
.predicate = st_within) |>
st_transform(crs = st_crs(z))
## Retrieving data for the year 2021
## Retrieving data for the year 2021
# Get polygons for Hartford's bodies of water
water <- st_intersection(
st_transform(area_water("CT", "Hartford"), crs = st_crs(hartford_tracts)),
hartford_tracts)
## Retrieving data for the year 2021
## Warning: attribute variables are assumed to be spatially constant throughout
## all geometries
# Rename tract columns to distinguish from parcel and crime data
# hartford_tracts <- hartford_tracts %>%
# rename(tract_geometry = geometry, tract_name = NAME, tract_id = GEOID)
# # Extract year from crime date
# c$year <- year(as.Date(c$Date))
# # Join parcel and crime data to census tracts (keep tract geometry)
# tp <- st_join(hartford_tracts, p)
# tc <- st_join(hartford_tracts, c)
# # Extract short tract name from in population data to match tract polygons
# pop$tract_name <- gsub("[^0-9.]", "", pop$Census.Tract)
# # Calculate average housing price by census tract
# ap <- tp %>%
# group_by(tract_name) %>%
# summarise(avg_house_value = mean(Assessed_Total, na.rm = TRUE))
#
# # Calculate crime rate by census tract and year
# at <- tc[1:1000,] %>%
# group_by(tract_name, year) %>%
# summarise(crime_count = n()) %>%
# left_join(pop, by = "tract_name") %>%
# mutate(crime_rate_per_1000 = crime_count / Estimated.Population * 1000) %>%
# select(tract_name, year, crime_rate_per_1000) %>%
# pivot_wider(names_from = year, values_from = crime_rate_per_1000) %>%
# left_join(ap, by = "tract_name") %>%
# select(tract_name, everything())
#
# write.csv(".//data//crime_rate_by_tract.csv", row.names = FALSE)
#
# tc
The filters applied to the data are:
The manually curated variables are:
- Price: Assessed total value of the parcel
- Thefts: Number of thefts (robberies, burglaries, larceny, theft, or stolen property) within 0.1 miles of the parcel
- Violence: Number of violent crimes (assaults, homicides, shootings) within 0.1 miles of the parcel
- Living_Area: Living area of the parcel
- Effective_Area: Effective area of the parcel
- Year: Approximate year the parcel was built
- Bed: Number of bedrooms in the parcel
- Bath: Number of bathrooms in the parcel
The highly correlated numeric covariates are
There appear to be more thefts and violent crimes near single family
homes in dilapidated condition. Homes in worse condition
tend to be older, but there are also older homes in good condition.
There is not a clear relationship between the year the parcel was built and its condition.
All covariates appear to correlate with the response variable,
Assessed_Total.
# install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Filter major property crimes and violent crimes
stealin <- z[grep("ROBBERY|BURGLARY|LARCENY|THEFT",
z$UCR_1_Category), ]
hurtin <- z[grep("ASSAULT|HOMICIDE|SHOOTING", z$UCR_1_Category), ]
# Get number of thefts and violent crimes within 0.1 miles of each parcel
p$thefts <- sapply(
st_is_within_distance(p, stealin, dist = units::set_units(0.1, "mi")),
length)
p$violence <- sapply(
st_is_within_distance(p, hurtin, dist = units::set_units(0.1, "mi")),
length)
# Order parcel condition ratings from worst to best so plots and model
# dummy coding follow the natural quality scale.
condition_levels <- c("Dilapidated", "Very Poor", "Poor", "Fair", "Fair-Avg",
                      "Average", "Avg-Good", "Good", "Good-VG", "Very Good",
                      "Excellent")
p$Condition_Description <- factor(as.character(p$Condition_Description),
                                  levels = condition_levels)
# Select predictors and filter out NA's.
# OBJECTID is kept as a parcel key; the remaining fields are renamed to the
# short variable names used in the write-up. na.omit() drops any row with a
# missing value in a selected field.
X <- p[, c("OBJECTID", "Assessed_Total", "thefts", "violence", "Living_Area",
"Effective_Area", "AYB", "Number_of_Bedroom", "Number_of_Baths",
"Condition_Description")] %>%
rename(Price = Assessed_Total, Thefts = thefts, Violence = violence,
Year = AYB, Bed = Number_of_Bedroom, Bath = Number_of_Baths,
Condition = Condition_Description) %>%
na.omit()
# Check that we haven't dropped too many rows (only 19 rows lost)
nrow(p) # 7239 rows
## [1] 7239
nrow(X) # 7220 rows
## [1] 7220
# Plot pairwise correlation between numeric predictors.
# Columns 2:9 are Price, Thefts, Violence, Living_Area, Effective_Area,
# Year, Bed, Bath (column 1 is the OBJECTID key; 10 is the Condition factor).
ggpairs(X, columns = c(2:9), lower = list(continuous = "points")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Boxplots of each numeric variable split by condition rating.
# Reshape to long format first so each variable gets its own free-scaled
# facet, with boxes filled by Condition within each facet.
LX <- X %>%
  pivot_longer(cols = c(Thefts, Violence, Living_Area, Effective_Area,
                        Year, Bed, Bath, Price),
               names_to = "variable", values_to = "value")
ggplot(LX, aes(variable, value)) +
  geom_boxplot(aes(fill = Condition), position = position_dodge(1)) +
  facet_wrap(~variable, scales = "free") +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid.major.x = element_blank(),
        axis.title.y = element_blank())
Variable Selection. We leave out Thefts since
it is highly correlated with Violence but has a lower
correlation with Price than Violence does. For
similar reasons, we leave out Effective_Area and keep
Living_Area. We keep both Bed and
Bath for now.
Transformations. From the correlation plots, we can see that
Price, Living_Area, and Violence
are right-skewed. To adjust for this, we take the log of
Price and the square root of Living_Area and
Violence.
# Fit a baseline linear model with no transformations
# (Thefts and Effective_Area already dropped per the variable selection
#  discussion above)
m0 <- lm(Price ~ Violence + Living_Area + Year + Bed + Bath + Condition,
data = X)
summary(m0)
##
## Call:
## lm(formula = Price ~ Violence + Living_Area + Year + Bed + Bath +
## Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74690 -7114 855 6837 110641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.440e+05 1.437e+04 -16.975 < 2e-16 ***
## Violence -2.269e+02 6.194e+00 -36.632 < 2e-16 ***
## Living_Area 3.349e+01 3.247e-01 103.138 < 2e-16 ***
## Year 1.122e+02 6.642e+00 16.890 < 2e-16 ***
## Bed -2.306e+03 2.287e+02 -10.086 < 2e-16 ***
## Bath 5.429e+03 3.547e+02 15.303 < 2e-16 ***
## ConditionVery Poor 7.343e+02 8.675e+03 0.085 0.9325
## ConditionPoor 1.342e+04 6.797e+03 1.974 0.0484 *
## ConditionFair 2.704e+04 6.392e+03 4.230 2.36e-05 ***
## ConditionFair-Avg 3.316e+04 6.243e+03 5.312 1.12e-07 ***
## ConditionAverage 4.205e+04 6.147e+03 6.841 8.51e-12 ***
## ConditionAvg-Good 4.959e+04 6.142e+03 8.073 7.97e-16 ***
## ConditionGood 5.046e+04 6.142e+03 8.215 2.49e-16 ***
## ConditionGood-VG 5.461e+04 6.161e+03 8.864 < 2e-16 ***
## ConditionVery Good 5.723e+04 6.164e+03 9.284 < 2e-16 ***
## ConditionExcellent 6.476e+04 6.314e+03 10.257 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13710 on 7204 degrees of freedom
## Multiple R-squared: 0.826, Adjusted R-squared: 0.8256
## F-statistic: 2280 on 15 and 7204 DF, p-value: < 2.2e-16
# Plot the four standard lm diagnostics (residuals vs fitted, Q-Q,
# scale-location, leverage) in a 2x2 grid
par(mfrow = c(2, 2))
plot(m0)
# Fit the model with transformations: log(Price) for the right-skewed
# response, square roots of Violence and Living_Area for the right-skewed
# covariates
m1 <- lm(log(Price) ~ sqrt(Violence) + sqrt(Living_Area) + Year +
Bed + Bath + Condition,
data = X)
summary(m1)
##
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + sqrt(Living_Area) +
## Year + Bed + Bath + Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26043 -0.07630 0.01616 0.09684 0.72052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.902e+00 1.769e-01 39.017 < 2e-16 ***
## sqrt(Violence) -3.826e-02 8.649e-04 -44.240 < 2e-16 ***
## sqrt(Living_Area) 2.827e-02 3.796e-04 74.452 < 2e-16 ***
## Year 9.011e-04 8.068e-05 11.170 < 2e-16 ***
## Bed -9.815e-03 2.813e-03 -3.489 0.000488 ***
## Bath 4.065e-02 4.154e-03 9.786 < 2e-16 ***
## ConditionVery Poor 7.608e-01 1.043e-01 7.294 3.33e-13 ***
## ConditionPoor 9.349e-01 8.171e-02 11.441 < 2e-16 ***
## ConditionFair 1.071e+00 7.686e-02 13.937 < 2e-16 ***
## ConditionFair-Avg 1.136e+00 7.507e-02 15.138 < 2e-16 ***
## ConditionAverage 1.381e+00 7.392e-02 18.684 < 2e-16 ***
## ConditionAvg-Good 1.503e+00 7.386e-02 20.354 < 2e-16 ***
## ConditionGood 1.520e+00 7.386e-02 20.587 < 2e-16 ***
## ConditionGood-VG 1.555e+00 7.409e-02 20.989 < 2e-16 ***
## ConditionVery Good 1.583e+00 7.412e-02 21.353 < 2e-16 ***
## ConditionExcellent 1.654e+00 7.592e-02 21.779 < 2e-16 ***
##
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1648 on 7204 degrees of freedom
## Multiple R-squared: 0.763, Adjusted R-squared: 0.7625
## F-statistic: 1546 on 15 and 7204 DF, p-value: < 2.2e-16
# Diagnostics for the transformed model (note the long lower residual tail
# investigated next)
plot(m1)
There are many parcels beyond 4 standard errors below the regression line. This means the model is severely overestimating the value of these parcels. Let’s find out what these parcels are.
The offending parcels have at most Average condition, although the majority of them are
Average or worse.

Solution. Refer to the previous correlation plots and
observe that Price and Living Area appear to have a strong
linear correlation. However, we performed a log
transformation on the Price variable but a square-root transformation on
the Living Area variable. This asymmetry may be the cause of the large
residuals.
# Investigate the large negative residuals (more than 4 SE's below zero).
# Divide raw residuals by the residual standard error to standardize them.
r <- residuals(m1) / summary(m1)$sigma # standardized residuals
o <- X[r < -4, ]
o$StdResid <- r[r < -4]
nrow(o)
## [1] 31
# Plot the offending parcels geographically (red points over the tract
# polygons; ggplotly/toWebGL makes the map interactive and fast).
# Fix: the x axis of a map is longitude and the y axis is latitude; the
# original labels were swapped.
g <- ggplot(o) +
  geom_sf(data = hartford_tracts) +
  geom_sf(data = o) +
  stat_sf_coordinates(size = 1, color = "red") +
  labs(x = "Longitude", y = "Latitude") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition, annotating each box with its group size.
# give.n: for one group's values, return the label position (just above the
# 75th percentile) and an "n = <count>" label, in the data.frame shape that
# stat_summary(fun.data = ...) expects.
give.n <- function(x) {
  data.frame(
    y = quantile(x, .75) + 0.1,
    label = paste0("n = ", length(x))
  )
}
ggplot(data = o, aes(x = Condition, y = StdResid)) +
  geom_boxplot() +
  # fun.data already supplies both y and label; the deprecated fun.y
  # argument (renamed 'fun' in ggplot2 3.3.0) was redundant and is dropped
  stat_summary(fun.data = give.n, geom = "text") +
  # theme_bw() is a complete theme and replaces earlier theme() settings,
  # so it must come before the x-label rotation (the original order
  # silently discarded the rotation)
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot residuals vs transformed numeric covariates.
# Append the transformed columns so ggpairs can plot them alongside the raw
# ones. Columns 7:9 are Year/Bed/Bath; 12:15 selects the appended
# StdResid/logPrice/sqrtViolence/sqrtLiving_Area block -- TODO(review):
# confirm these indices against names(o).
o <- o %>%
  mutate(logPrice = log(Price),
         sqrtViolence = sqrt(Violence),
         sqrtLiving_Area = sqrt(Living_Area))
# Fix: spell out progress = FALSE ('F' is a reassignable binding)
ggpairs(o, columns = c(12:15, 7:9), lower = list(continuous = "points"),
        progress = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Let’s test our hypothesis. After taking the log of
Living_Area, the residuals are
more symmetrically distributed. We may have sacrificed some upper tail
normality for the lower tail, since there appear to be more residuals
larger than 4 now. However, we have reduced the total number of
residuals beyond 4 SE’s from 33 to 21.
# Take log of living area
# Refit with log(Living_Area) so Price and Living_Area share the same
# (log) transformation, matching their strong linear relationship
m2 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year +
Bed + Bath + Condition,
data = X)
summary(m2)
##
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) +
## Year + Bed + Bath + Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26663 -0.08683 0.01414 0.10361 1.09631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.1877985 0.2012699 20.807 < 2e-16 ***
## sqrt(Violence) -0.0414237 0.0008987 -46.094 < 2e-16 ***
## log(Living_Area) 0.5556087 0.0082575 67.286 < 2e-16 ***
## Year 0.0007582 0.0000839 9.036 < 2e-16 ***
## Bed -0.0010192 0.0029147 -0.350 0.727
## Bath 0.0751240 0.0041665 18.031 < 2e-16 ***
## ConditionVery Poor 0.7690082 0.1087200 7.073 1.66e-12 ***
## ConditionPoor 0.9304490 0.0851789 10.923 < 2e-16 ***
## ConditionFair 1.0710818 0.0801190 13.369 < 2e-16 ***
## ConditionFair-Avg 1.1307146 0.0782497 14.450 < 2e-16 ***
## ConditionAverage 1.3693732 0.0770514 17.772 < 2e-16 ***
## ConditionAvg-Good 1.4886232 0.0769892 19.335 < 2e-16 ***
## ConditionGood 1.5035464 0.0769862 19.530 < 2e-16 ***
## ConditionGood-VG 1.5399516 0.0772256 19.941 < 2e-16 ***
## ConditionVery Good 1.5703838 0.0772606 20.326 < 2e-16 ***
## ConditionExcellent 1.6457653 0.0791454 20.794 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1718 on 7204 degrees of freedom
## Multiple R-squared: 0.7425, Adjusted R-squared: 0.7419
## F-statistic: 1385 on 15 and 7204 DF, p-value: < 2.2e-16
# Diagnostics for m2; note Bed is no longer significant (p = 0.727 above),
# motivating the next model
par(mfrow = c(2, 2))
plot(m2)
# Drop number of bedrooms (not significant in m2); this is the final model
m3 <- lm(log(Price) ~ sqrt(Violence) + log(Living_Area) + Year +
Bath + Condition,
data = X)
summary(m3)
##
## Call:
## lm(formula = log(Price) ~ sqrt(Violence) + log(Living_Area) +
## Year + Bath + Condition, data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26656 -0.08634 0.01404 0.10379 1.09043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.195e+00 2.002e-01 20.950 < 2e-16 ***
## sqrt(Violence) -4.145e-02 8.947e-04 -46.332 < 2e-16 ***
## log(Living_Area) 5.541e-01 6.987e-03 79.297 < 2e-16 ***
## Year 7.591e-04 8.385e-05 9.053 < 2e-16 ***
## Bath 7.486e-02 4.099e-03 18.266 < 2e-16 ***
## ConditionVery Poor 7.689e-01 1.087e-01 7.073 1.66e-12 ***
## ConditionPoor 9.300e-01 8.516e-02 10.920 < 2e-16 ***
## ConditionFair 1.071e+00 8.010e-02 13.365 < 2e-16 ***
## ConditionFair-Avg 1.130e+00 7.824e-02 14.447 < 2e-16 ***
## ConditionAverage 1.369e+00 7.704e-02 17.770 < 2e-16 ***
## ConditionAvg-Good 1.488e+00 7.697e-02 19.333 < 2e-16 ***
## ConditionGood 1.503e+00 7.697e-02 19.528 < 2e-16 ***
## ConditionGood-VG 1.539e+00 7.721e-02 19.939 < 2e-16 ***
## ConditionVery Good 1.570e+00 7.724e-02 20.324 < 2e-16 ***
## ConditionExcellent 1.645e+00 7.913e-02 20.793 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1718 on 7205 degrees of freedom
## Multiple R-squared: 0.7425, Adjusted R-squared: 0.742
## F-statistic: 1484 on 14 and 7205 DF, p-value: < 2.2e-16
# Diagnostics for the final model
plot(m3)
# Count residuals more than 4 SE's from zero, for m1 vs the final model m3
r1 <- residuals(m1) / summary(m1)$sigma
sum(abs(r1) > 4)
## [1] 33
r3 <- residuals(m3) / summary(m3)$sigma
sum(abs(r3) > 4)
## [1] 21
Repeating the same residuals analysis as before, we see that there are no longer any covariates strongly correlated with the residuals.
# Investigate the large negative residuals of the final model m3
r <- residuals(m3) / summary(m3)$sigma # standardized residuals
is_low <- r < -4
o <- X[is_low, ]
o$StdResid <- r[is_low]
nrow(o)
## [1] 17
# Plot the offending parcels geographically (red points over the tract
# polygons; ggplotly/toWebGL makes the map interactive and fast).
# Fix: the x axis of a map is longitude and the y axis is latitude; the
# original labels were swapped.
g <- ggplot(o) +
  geom_sf(data = hartford_tracts) +
  geom_sf(data = o) +
  stat_sf_coordinates(size = 1, color = "red") +
  labs(x = "Longitude", y = "Latitude") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
toWebGL(ggplotly(g))
# Plot residuals vs. Condition, annotating each box with its group size.
# give.n: for one group's values, return the label position (just above the
# 75th percentile) and an "n = <count>" label, in the data.frame shape that
# stat_summary(fun.data = ...) expects.
give.n <- function(x) {
  data.frame(
    y = quantile(x, .75) + 0.1,
    label = paste0("n = ", length(x))
  )
}
ggplot(data = o, aes(x = Condition, y = StdResid)) +
  geom_boxplot() +
  # fun.data already supplies both y and label; the deprecated fun.y
  # argument (renamed 'fun' in ggplot2 3.3.0) was redundant and is dropped
  stat_summary(fun.data = give.n, geom = "text") +
  # theme_bw() is a complete theme and replaces earlier theme() settings,
  # so it must come before the x-label rotation (the original order
  # silently discarded the rotation)
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot residuals vs transformed numeric covariates for the final model.
# Columns 7 and 9 are Year and Bath (Bed was dropped from m3); 12:15
# selects the appended StdResid/logPrice/sqrtViolence/logLiving_Area block
# -- TODO(review): confirm these indices against names(o).
o <- o %>%
  mutate(logPrice = log(Price),
         sqrtViolence = sqrt(Violence),
         logLiving_Area = log(Living_Area))
# Fix: spell out progress = FALSE ('F' is a reassignable binding)
ggpairs(o, columns = c(12:15, 7, 9), lower = list(continuous = "points"),
        progress = FALSE) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# library(glmnet)
# # Format predictors and response variable from parcel data
# X <- p %>%
# dplyr::select(Condition_Description,
# AYB, Living_Area, Effective_Area,
# Total_Rooms, Number_of_Bedroom, Number_of_Baths,
# thefts, violence) %>%
# # mutate(sqrt_Living_Area = sqrt(Living_Area),
# # sqrt_Effective_Area = sqrt(Effective_Area),
# # thefts1 = thefts + 1,
# # thefts2 = thefts*2) %>%
# sf::st_drop_geometry()
# X <- data.matrix(X)
# y <- p$Assessed_Total
# # Remove rows with missing values
# w <- complete.cases(X, y)
# X <- X[w, ]
# y <- y[w]
# # Select variables
# # Run lasso regression
# cv_tune.lasso_model = suppressMessages(suppressWarnings(
# cv.glmnet(x = X,
# y = y,
# nlambda = 1000,
# nfolds = 500,
# pmax = 15,
# parallel = TRUE)))
#
# plot(cv_tune.lasso_model)
#
# lasso_modelmin = glmnet(x = X, y = y, lambda = cv_tune.lasso_model$lambda.min)
# lasso_model1se = glmnet(x = X, y = y, lambda = cv_tune.lasso_model$lambda.1se)
#
# # Does not select Total_Rooms
# coef(lasso_modelmin)
# # Does not select Total_Rooms, Number_of_Bedroom, thefts
# coef(lasso_model1se)
#
# vars_keepmin = rownames(coef(lasso_modelmin))[
# which(as.matrix(coef(lasso_modelmin)) != 0)][-1]
# vars_keep1se = rownames(coef(lasso_model1se))[
# which(as.matrix(coef(lasso_model1se)) != 0)][-1]
# # Calculate centroids of parcels and crime
# centroids <- st_centroid(p)
# coords <- st_coordinates(centroids)
# p$centroid.x <- coords[, 'X']
# p$centroid.y <- coords[, 'Y']
#
# centroids <- st_centroid(c)
# coords <- st_coordinates(centroids)
# c$centroid.x <- coords[, 'X']
# c$centroid.y <- coords[, 'Y']
#
#
# # Filter property crimes and violent crimes
# stealin <- c[grep("ROBBERY|BURGLARY|LARCENY|THEFT|STOLEN",
# c$UCR_1_Category), ]
# hurtin <- c[grep("ASSAULT|HOMICIDE|SHOOTING", c$UCR_1_Category), ]
#
# # Get number of thefts and violent crimes within 0.1 miles of each parcel
# p$thefts <- sapply(
# st_is_within_distance(p, stealin, dist = units::set_units(0.1, "mi")),
# length)
# p$violence <- sapply(
# st_is_within_distance(p, hurtin, dist = units::set_units(0.1, "mi")),
# length)
#
# st_within(p$geometry[1], p$geometry[1])
# st_within(p$geometry[1], hartford_tracts[1])
# # For each census tract, get average property value in 2021 and
# # average annual crime count
# # Create a new dataframe
# library(dplyr)
# # Add a column for year
# c <- c %>%
# mutate(year = year(as.Date(Date)))
# # Drop geometry data
# cc <- sf::st_drop_geometry(c)
# pp <- sf::st_drop_geometry(p)
#
# # Calculate total crimes per year
# crimes_per_year <- table(cc$year)
#
#
# select(year) %>%
# group_by(year) %>%
# summarise(total_crime_per_yr = n())
#
#
# # Aggregate crime to census level to get crime rate
# # Plot census tracts colored by crime rates and sized by property values
# l1 = leaflet(c_tract) %>%
#
# addProviderTiles('CartoDB.Positron') %>%
#
# ## census tracts
# addPolygons(fillColor = ~pal1(rescaled.house.value),
# label = ~label %>% lapply(htmltools::HTML),
# weight = 0.5,
# color = 'black',
# fillOpacity = 0.8) %>%
#
# # stops
# addCircleMarkers(data = ds,
# lng = ~stop1_lon,
# lat = ~stop1_lat,
# label = ~label %>% lapply(htmltools::HTML),
# color = pubred,
# radius = ~log(`n_to_Grand Central` + `n_to_New Haven`)) %>%
#
# setView(lng = -73.3,
# lat = 41.21979,
# zoom = 10) %>%
#
# addTiles()
# # Plot sample of crimes and parcels
#
# # library(mapview)
# # mapview(dplyr::sample_n(c, 1e3), dplyr::sample_n(p, 1e3))
#
# g <- ggplot(dplyr::sample_n(c, 1e3)) +
# geom_sf(data = hartford_tracts) +
# geom_sf(data = dplyr::sample_n(p, 1e3)) +
# geom_density_2d(aes(X,Y), data = ~cbind(.x, st_coordinates(.x))) +
# stat_sf_coordinates(size = 0.1, color = "red") +
# labs(x = "Latitude", y = "Longitude") +
# theme_bw()
#
# p <- toWebGL(ggplotly(g))
# p$x$data[[4]]$hoverinfo <- "none"
# p
# # Plot average residential parcel value by census tract
# # with crime counts binned by area
#
# # Map parcel address to
#
#
# # Get average residential parcel value by census tract
# parcel_dg$Zone
#
# hartford_tracts
#
# parcel_dg$avg_residential_value <- parcel_dg$AV_LAND / parcel_dg$AV_TOTAL
Spatial regression
https://oerstatistics.wordpress.com/wp-content/uploads/2016/03/intro_to_r.pdf#page=68.08
https://crd230.github.io/lab8.html
Kernel density estimation